Van den Berge et al. recently showed that observation weights can facilitate application of standard RNA-seq differential expression tools such as DESeq and edgeR for single cell RNA-seq experiments. Our objective here is to show that cell weighting by the estimated dropout proportion can also facilitate the application of differential expression tools to scRNA-seq experiments. I’ll first start off with a simulation.

I will assume the following model. \[ \begin{aligned} y_{ij} &= \text{count of gene } j \text{ in cell } i; \notag \\ y_{ij} &\sim \pi_{ij} 1(0) + (1 - \pi_{ij}) \text{Pois}(\lambda_{ij}); \notag \\ \lambda_{ij} &\sim d*\phi_{ij}*\lambda_{0j}*\log \text{Normal} ( -0.5, 1^{2}); \notag \\ \phi_{ij} &= 5 \text{ for } 10\% \text{ of the genes in group 1 and } = 0.2 \text{ for } 10\% \text{ of the genes in group 1, and 0 otherwise}; \notag \\ \lambda_{0j} &\sim \text{Gamma}(\text{shape } = 0.1, \text{ rate } = 0.1); \notag \\ \theta_{i} &\sim \log \text{Normal}(-2.5^{2}/2, 2.5^2 ). \notag \\ d &= \text{ depth factor}; \notag \\ \pi_{ij} &= 1/(1 + \text{exp}(\log(1/\pi_{0j} - 1) + 0.5*(\phi_{i,j} - \bar{\phi}))) \notag \\ \pi_{0j} &\sim \text{Beta}(2, 4) \end{aligned} \]

set.seed(1)
n_cells = 1000
cellLabels = factor(c(paste0("treatment", 1:(n_cells/2)), paste0("control", (n_cells/2 + 1):n_cells)))
n_genes = 20000
theta = exp(rnorm(n_cells, -(2.5^2)/2, 2.5))
hist(theta, breaks = 50, col = "grey")

lambda0 = rgamma(n_genes, shape = 0.1, rate = 0.1)
hist(lambda0, breaks = 100, col = "grey")

phi = matrix(1, nrow = n_genes, ncol = n_cells)
dropout_baseline = rbeta(n_cells, 2, 4)
hist(dropout_baseline, breaks = 100)

dropout_baseline = -log(1/dropout_baseline - 1) # inverse of logistic function
depth_factor = 0.05
for(i in 1:(0.5*n_cells)){
  for(j in 1:n_genes){
    if(j <= 0.1*n_genes){
      phi[j, i] = 5
    }
    if(j <= 0.2*n_genes & j > 0.1*n_genes){
      phi[j, i] = 0.2
    }
  }
}
geneConditions = c(rep(1, times = 0.1*n_genes), 
                   rep(-1, times = 0.1*n_genes), 
                   rep(0, times = 0.8*n_genes))
phibar = mean(phi)
dropout_indicator = mat.or.vec(n_genes, n_cells)
dropout_chance = mat.or.vec(n_genes, n_cells)
for(i in 1:n_genes){
  for(j in 1:n_cells){
      dropout_chance[i,j] = 1/(1 + exp(-dropout_baseline[j] + (phi[i,j] - phibar)))
      dropout_indicator[i,j] = rbinom(1, 1, dropout_chance[i,j])
    }
}

hist(dropout_chance, breaks = 101)

counts = mat.or.vec(n_genes, n_cells)
colnames(counts) = cellLabels
rownames(counts) = paste0("gene", 1:n_genes)
for(i in 1:n_cells){
  for(j in 1:n_genes){
    if(dropout_indicator[j, i] == 0){
      #counts[j, i] = rnbinom(1, mu = depth_factor*phi[j, i]*lambda0[j]*theta[i], size = 0.1)
      counts[j, i] = rpois(1, depth_factor*phi[j, i]*lambda0[j]*theta[i]*exp(rnorm(1, -0.5, sd = 1)))
    }
  }
}
hist(apply(counts, 2, sum), breaks = 100)

hist(log(apply(counts, 2, sum)), breaks = 100)

mean(apply(counts, 2, sum))
## [1] 1307.174
max(counts)
## [1] 16556
#dropout_chance.vs.logcounts = data.frame(dropout_chance = c(dropout_chance), logcounts = log(c(counts + 1)))
#library(ggplot2)
#ggplot(dropout_chance.vs.logcounts[sample.int(dim(dropout_chance.vs.logcounts)[1], dim(dropout_chance.vs.logcounts)[1]/10, replace = FALSE), ], aes(x = dropout_chance, y = logcounts)) + geom_point(alpha = 0.5) + stat_smooth()

hist(apply(counts, 2, function(x) sum(x == 0)), breaks = 100)

cells2keep = which(colSums(counts) > 50)
n_cells = length(cells2keep)
n_cells
## [1] 436
genes2keep = intersect(which(rowSums(counts) > 10), which(rowSums((counts > 0)) > 3))
geneConditions = geneConditions[genes2keep]
n_genes = length(genes2keep)
n_genes
## [1] 5262
counts = counts[genes2keep, cells2keep]
dim(counts)
## [1] 5262  436
cellLabels = cellLabels[cells2keep]
#counts = counts[ , cells2keep]
dropout_indicator = dropout_indicator[genes2keep, cells2keep]
#dropout_indicator = dropout_indicator[ , cells2keep]
phi = phi[genes2keep, cells2keep]
true_dropout = colSums(phi == 0 | dropout_indicator == 1)/n_genes
hist(true_dropout, breaks = 100)
observed_dropout = colSums((counts == 0))/n_genes
library(ggplot2)

d = data.frame(true_dropout = true_dropout, observed_dropout = observed_dropout)
ggplot(d, aes(y = true_dropout, x = observed_dropout)) + geom_point(alpha = 0.8)

cor(true_dropout, observed_dropout)

preseq_estimated_dropout = rep(0, times = n_cells)
chao_estimated_dropout = rep(0, times = n_cells)
chaobunge_estimated_dropout = rep(0, times = n_cells)
chaolee_estimated_dropout = rep(0, times = n_cells)
#pnpmle_estimated_dropout = rep(0, times = n_cells)
breakaway_estimated_dropout = rep(0, times = n_cells)
jackknife_estimated_dropout = rep(0, times = n_cells)
negbinom_estimated_dropout = rep(0, times = n_cells)

depth = rep(0, times = n_cells)
library(SPECIES)
library(breakaway)
library(preseqR)
neg_binom_species <- function(y_hist){
  ztnb_fit = preseqR.ztnb.em(y_hist)
  return(sum(y_hist[,2])*(1 + 1/(1 - (1 + ztnb_fit$mu/ztnb_fit$size)^(-ztnb_fit$size))))
}
for(i in 1:n_cells){
    y = counts[,i]
    y = y[(y > 0)]
    y_unique = sort(unique(y), decreasing = FALSE)
    y_hist = data.frame(j = y_unique, n_j = sapply(y_unique, function(x) length(which(y == x))))
    write.table(y_hist, file = "y_hist.txt", sep = "\t", quote = FALSE, col.names = FALSE, row.names = FALSE)
    if(dim(y_hist)[1] > 1){
    system("~/preseq/preseq bound_pop -H y_hist.txt -o preseq_estim.txt")
    preseq_estim = read.table(file = "preseq_estim.txt", header = TRUE)
    preseq_estimated_dropout[i] = 1 - preseq_estim$median_estimated_unobs/n_genes
    chao_estimated_dropout[i] = 1 - chao1984(y_hist)$Nhat/n_genes
    chaobunge_estimated_dropout[i] = 1 - ChaoBunge(y_hist)$Nhat/n_genes
    chaolee_estimated_dropout[i] = 1 - ChaoLee1992(y_hist, method = "ACE")$Nhat/n_genes
    #pnpmle_estimated_dropout[i] = 1 - pnpmle(y_hist, C=0, dis = 0)$Nhat/n_genes
    negbinom_estimated_dropout[i] = 1 - neg_binom_species(y_hist)/n_genes
    if(dim(y_hist)[1] == 6 & y_hist[6, 1] == 6){
      # need sufficient number of counts for breakaway
      breakaway_estimated_dropout[i] = 1 - breakaway(y_hist, plot = FALSE, print = FALSE, answers = TRUE)$est/n_genes
    }
    jackknife_estimated_dropout[i] = 1 - jackknife(y_hist)$Nhat/n_genes
    }
}
d = data.frame(d, preseq_estimated_dropout = preseq_estimated_dropout, 
               chao_estimated_dropout = chao_estimated_dropout, 
               chaolee_estimated_dropout = chaolee_estimated_dropout, 
               chaobunge_estimated_dropout = chaobunge_estimated_dropout, 
               #pnpmle_estimated_dropout = pnpmle_estimated_dropout, 
               breakaway_estimated_dropout = breakaway_estimated_dropout, 
               jackknife_estimated_dropout = jackknife_estimated_dropout,
               negbinom_estimated_dropout = negbinom_estimated_dropout,
               depth = colSums(counts))

d$preseq_estimated_dropout[which(d$preseq_estimated_dropout <= 0 | is.na(d$preseq_estimated_dropout))] = d$observed_dropout[which(d$preseq_estimated_dropout <= 0)]
#d$pnpmle_estimated_dropout[which(d$pnpmle_estimated_dropout <= 0)] = d$observed_dropout[which(d$pnpmle_estimated_dropout <= 0)]
d$chaobunge_estimated_dropout[which(d$chaobunge_estimated_dropout <= 0 | is.na(d$chaobunge_estimated_dropout))] = d$observed_dropout[which(d$chaobunge_estimated_dropout <= 0)]
## Warning in
## d$chaobunge_estimated_dropout[which(d$chaobunge_estimated_dropout <= :
## number of items to replace is not a multiple of replacement length
d$chao_estimated_dropout[which(d$chao_estimated_dropout <= 0 | is.na(d$chao_estimated_dropout))] = d$observed_dropout[which(d$chao_estimated_dropout <= 0)]
d$chaolee_estimated_dropout[which(d$chaolee_estimated_dropout <= 0 | is.na(d$chaolee_estimated_dropout))] = d$observed_dropout[which(d$chaolee_estimated_dropout <= 0)]
d$breakaway_estimated_dropout[which(d$breakaway_estimated_dropout <= 0 | is.na(d$breakaway_estimated_dropout))] = d$observed_dropout[which(d$breakaway_estimated_dropout <= 0)]
d$jackknife_estimated_dropout[which(d$jackknife_estimated_dropout <= 0 | is.na(d$jackknife_estimated_dropout))] = d$observed_dropout[which(d$jackknife_estimated_dropout <= 0)]
d$negbinom_estimated_dropout[which(d$negbinom_estimated_dropout <= 0 | is.na(d$negbinom_estimated_dropout))] = d$observed_dropout[which(d$negbinom_estimated_dropout <= 0)]
d$chaobunge_estimated_dropout[which(d$chaobunge_estimated_dropout > 1)] = 1
d$chaolee_estimated_dropout[which(d$chaolee_estimated_dropout > 1)] = 1

y = data.frame(algorithm = c("observed", "ACE", "breakaway", "ChaoBunge", "Chao", "jackknife", "preseq", "negbinom"#, "pnpmle"
                             ), correlations = c(cor(d$depth, d$observed_dropout), 
                                                 cor(d$depth, d$chaolee_estimated_dropout),
                                                 cor(d$depth, d$breakaway_estimated_dropout),
                                                 cor(d$depth, d$chaobunge_estimated_dropout),
                                                 cor(d$depth, d$chao_estimated_dropout),
                                                 cor(d$depth, d$jackknife_estimated_dropout),
                                                 cor(d$depth, d$preseq_estimated_dropout),
                                                 cor(d$depth, d$negbinom_estimated_dropout)
                                                 #,cor(d$depth, d$pnpmle_estimated_dropout)
                                                 ))
y$algorithm = factor(y$algorithm, levels = c("observed","ACE", "breakaway", "ChaoBunge", "Chao", "jackknife", "preseq", "negbinom"#, "pnpmle"
                                             ))
ggplot(y, aes(x = algorithm, y = correlations)) + geom_bar(stat="identity") + theme_bw() + theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(), axis.line = element_line(colour = "black"))

y
##   algorithm correlations
## 1  observed   -0.3813217
## 2       ACE   -0.2596501
## 3 breakaway   -0.3289994
## 4 ChaoBunge   -0.1934357
## 5      Chao   -0.2707138
## 6 jackknife   -0.2058381
## 7    preseq   -0.2112565
## 8  negbinom   -0.3088861
c = data.frame(correlations = c(cor(d$true_dropout, d$observed_dropout), 
                                cor(d$true_dropout, d$chaolee_estimated_dropout),
                                cor(d$true_dropout, d$breakaway_estimated_dropout),
                                cor(d$true_dropout, d$chaobunge_estimated_dropout),
                                cor(d$true_dropout, d$chao_estimated_dropout), 
                                cor(d$true_dropout, d$jackknife_estimated_dropout),
                                cor(d$true_dropout, d$preseq_estimated_dropout), 
                                cor(d$true_dropout, d$negbinom_estimated_dropout)
                                #, cor(d$true_dropout, d$pnpmle_estimated_dropout)
                                                                                                                                                                                                                                                                                                                                                                                           ), 
               algorithm = c("observed", "ACE", "breakaway", "ChaoBunge", "Chao", "jackknife", "preseq", "negbinom"
#, "pnpmle"
))
c
##   correlations algorithm
## 1    0.2270614  observed
## 2    0.4025388       ACE
## 3    0.2034117 breakaway
## 4    0.0746364 ChaoBunge
## 5    0.4031629      Chao
## 6    0.2925732 jackknife
## 7    0.3471022    preseq
## 8    0.1446541  negbinom
meanSquareError = data.frame(meanSquareError = c(mean((d$true_dropout - d$observed_dropout)^2), 
                                                 mean((d$true_dropout - d$chaolee_estimated_dropout)^2),
                                                 mean((d$true_dropout - d$breakaway_estimated_dropout)^2),
                                                 mean((d$true_dropout - d$chaobunge_estimated_dropout)^2),
                                                 mean((d$true_dropout - d$chao_estimated_dropout)^2),
                                                 mean((d$true_dropout - d$jackknife_estimated_dropout)^2), 
                                                 mean((d$true_dropout - d$preseq_estimated_dropout)^2),
                                                 mean((d$true_dropout - d$negbinom_estimated_dropout)^2)),
                             
                             algorithm = c("observed", "ACE", "breakaway", "ChaoBunge", "Chao", "jackknife", "preseq", "negbinom"))
meanSquareError$algorithm = factor(meanSquareError$algorithm, levels = c("observed", "ACE", "breakaway", "ChaoBunge", "Chao", "jackknife", "negbinom", "preseq"))
meanSquareError
##   meanSquareError algorithm
## 1       0.3711687  observed
## 2       0.2378723       ACE
## 3       0.3632310 breakaway
## 4       0.3700155 ChaoBunge
## 5       0.2413438      Chao
## 6       0.2559039 jackknife
## 7       0.2165532    preseq
## 8       0.3408163  negbinom
ggplot(meanSquareError, aes(y = meanSquareError, x = algorithm)) + geom_bar(stat="identity") + theme_bw() + theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(), axis.line = element_line(colour = "black"))

ggplot(c, aes(x = algorithm, y = correlations)) + geom_bar(stat="identity") + theme_bw() + theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(), axis.line = element_line(colour = "black"))

ggplot(d, aes(x = observed_dropout, y = true_dropout)) + geom_point(alpha = 0.8) + theme_bw() + theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(), axis.line = element_line(colour = "black")) + xlim(0, 1) + ylim(0, 1) + geom_abline(intercept = 0, slope = 1)

ggplot(d, aes(x = preseq_estimated_dropout, y = true_dropout)) + geom_point(alpha = 0.8) + theme_bw() + theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(), axis.line = element_line(colour = "black")) + xlim(0, 1) + ylim(0, 1) + geom_abline(intercept = 0, slope = 1)

ggplot(d, aes(x = negbinom_estimated_dropout, y = true_dropout)) + geom_point(alpha = 0.8) + theme_bw() + theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(), axis.line = element_line(colour = "black")) + xlim(0, 1) + ylim(0, 1) + geom_abline(intercept = 0, slope = 1)

#ggplot(d, aes(x = pnpmle_estimated_dropout, y = true_dropout)) + geom_point(alpha = 0.8) + theme_bw() + theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(), axis.line = element_line(colour = "black")) + xlim(0, 1) + ylim(0, 1) + geom_abline(intercept = 0, slope = 1)
ggplot(d, aes(x = chaolee_estimated_dropout, y = true_dropout)) + geom_point(alpha = 0.8) + theme_bw() + theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(), axis.line = element_line(colour = "black")) + xlim(0, 1) + ylim(0, 1) + geom_abline(intercept = 0, slope = 1)

ggplot(d, aes(x = jackknife_estimated_dropout, y = true_dropout)) + geom_point(alpha = 0.8) + theme_bw() + theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(), axis.line = element_line(colour = "black")) + xlim(0, 1) + ylim(0, 1) + geom_abline(intercept = 0, slope = 1)

ggplot(d, aes(x = chao_estimated_dropout, y = true_dropout)) + geom_point(alpha = 0.8) + theme_bw() + theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(), axis.line = element_line(colour = "black")) + xlim(0, 1) + ylim(0, 1) + geom_abline(intercept = 0, slope = 1)

ggplot(d, aes(x = breakaway_estimated_dropout, y = true_dropout)) + geom_point(alpha = 0.8) + theme_bw() + theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(), axis.line = element_line(colour = "black")) + xlim(0, 1) + ylim(0, 1) + geom_abline(intercept = 0, slope = 1)

ggplot(d, aes(x = chaobunge_estimated_dropout, y = true_dropout)) + geom_point(alpha = 0.8) + theme_bw() + theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(), axis.line = element_line(colour = "black")) + xlim(0, 1) + ylim(0, 1) + geom_abline(intercept = 0, slope = 1)

sampleWeights = (1 - preseq_estimated_dropout)
hist(sampleWeights, breaks = 50, col = "grey")

library(edgeR)
## Warning: package 'edgeR' was built under R version 3.5.2
## Loading required package: limma
condition = factor(as.numeric(startsWith(sapply(cellLabels, toString), "treat")))
design <- model.matrix(~condition)
naiveDGE = DGEList(counts)
naiveDGE = calcNormFactors(naiveDGE)
## Warning in max(abs(logR)): no non-missing arguments to max; returning -Inf
## Warning in max(abs(logR)): no non-missing arguments to max; returning -Inf

## Warning in max(abs(logR)): no non-missing arguments to max; returning -Inf

## Warning in max(abs(logR)): no non-missing arguments to max; returning -Inf

## Warning in max(abs(logR)): no non-missing arguments to max; returning -Inf
naiveV = cpm(naiveDGE, log=TRUE, prior.count=3)
naiveFit = lmFit(naiveV, design=design)
naiveEB = eBayes(naiveFit, trend=TRUE, robust=TRUE)
decideTests(naiveEB)
## TestResults matrix
##        (Intercept) condition1
## gene4            1          0
## gene8            1          1
## gene10           1          1
## gene13           1          1
## gene15           1          0
## 5257 more rows ...
naiveEB.results = topTable(naiveEB, number = dim(counts)[1])
## Removing intercept from test coefficients
dim(naiveEB.results)
## [1] 5262    6
head(naiveEB.results)
##              logFC  AveExpr        t      P.Value    adj.P.Val        B
## gene489  0.7250079 12.72025 9.119931 1.635843e-18 8.607806e-15 31.33298
## gene479  0.7441094 12.74522 8.873296 1.142117e-17 2.174486e-14 29.44287
## gene168  0.8022791 12.76054 8.862781 1.239730e-17 2.174486e-14 29.36313
## gene282  0.6847587 12.62025 8.777003 2.414043e-17 3.175674e-14 28.71521
## gene1146 0.5904708 12.55575 8.726298 3.571678e-17 3.758834e-14 28.33440
## gene1726 0.5506475 12.53994 8.612241 8.569685e-17 7.515614e-14 27.48378
length(geneConditions)
## [1] 5262
sum(geneConditions > 0)
## [1] 732
sum(naiveEB.results$logFC > 0)
## [1] 1497
sum(naiveEB.results$adj.P.Val < 0.05)
## [1] 941
sum(geneConditions > 0 & naiveEB.results$logFC > 0 & naiveEB.results$adj.P.Val < 0.05)
## [1] 430
sum(geneConditions > 0 & naiveEB.results$logFC > 0 & naiveEB.results$adj.P.Val < 0.05)/sum(geneConditions > 0)
## [1] 0.5874317
sum(geneConditions < 0)
## [1] 435
sum(naiveEB.results$logFC < 0)
## [1] 3765
sum(naiveEB.results$adj.P.Val < 0.05)
## [1] 941
sum(geneConditions < 0 & naiveEB.results$logFC < 0 & naiveEB.results$adj.P.Val < 0.05)
## [1] 155
sum(geneConditions < 0 & naiveEB.results$logFC < 0 & naiveEB.results$adj.P.Val < 0.05)/sum(geneConditions > 0)
## [1] 0.2117486
# corrected by dropout rate
condition = factor(as.numeric(startsWith(sapply(cellLabels, toString), "treat")))
design <- model.matrix(~condition)
correctedDGE = DGEList(counts)
correctedDGE = calcNormFactors(correctedDGE)
## Warning in max(abs(logR)): no non-missing arguments to max; returning -Inf

## Warning in max(abs(logR)): no non-missing arguments to max; returning -Inf

## Warning in max(abs(logR)): no non-missing arguments to max; returning -Inf

## Warning in max(abs(logR)): no non-missing arguments to max; returning -Inf

## Warning in max(abs(logR)): no non-missing arguments to max; returning -Inf
correctedV = cpm(correctedDGE, log=TRUE, prior.count=3)
correctedFit = lmFit(correctedV, design=design, weights = 1 - preseq_estimated_dropout)
correctedEB = eBayes(correctedFit, trend=TRUE, robust=TRUE)
decideTests(correctedEB)
## TestResults matrix
##        (Intercept) condition1
## gene4            1          0
## gene8            1          1
## gene10           1          1
## gene13           1          1
## gene15           1          0
## 5257 more rows ...
correctedEB.results = topTable(correctedEB, number = dim(counts)[1])
## Removing intercept from test coefficients
dim(correctedEB.results)
## [1] 5262    6
head(correctedEB.results)
##              logFC  AveExpr        t      P.Value    adj.P.Val        B
## gene1146 0.8294813 12.55575 11.19792 5.125029e-26 2.696790e-22 48.26536
## gene168  1.1149151 12.76054 11.04620 1.981103e-25 5.212281e-22 46.94055
## gene489  0.9466060 12.72025 10.87410 9.066274e-25 1.590225e-21 45.45057
## gene282  0.9365995 12.62025 10.81169 1.568305e-24 2.063105e-21 44.91375
## gene1430 0.9072340 12.64168 10.32991 1.011474e-22 9.182279e-20 40.83350
## gene479  0.9681175 12.74522 10.32586 1.047010e-22 9.182279e-20 40.79970
length(geneConditions)
## [1] 5262
sum(geneConditions > 0)
## [1] 732
sum(correctedEB.results$logFC > 0)
## [1] 1802
sum(correctedEB.results$adj.P.Val < 0.05)
## [1] 1399
sum(geneConditions > 0 & correctedEB.results$logFC > 0 & correctedEB.results$adj.P.Val < 0.05)
## [1] 462
sum(geneConditions > 0 & correctedEB.results$logFC > 0 & correctedEB.results$adj.P.Val < 0.05)/sum(geneConditions > 0)
## [1] 0.6311475
sum(geneConditions < 0)
## [1] 435
sum(correctedEB.results$logFC < 0)
## [1] 3460
sum(correctedEB.results$adj.P.Val < 0.05)
## [1] 1399
sum(geneConditions < 0 & correctedEB.results$logFC < 0 & correctedEB.results$adj.P.Val < 0.05)
## [1] 309
sum(geneConditions < 0 & correctedEB.results$logFC < 0 & correctedEB.results$adj.P.Val < 0.05)/sum(geneConditions > 0)
## [1] 0.4221311
# corrected by dropout rate
condition = factor(as.numeric(startsWith(sapply(cellLabels, toString), "treat")))
design <- model.matrix(~condition)
AWcorrectedDGE = DGEList(counts)
AWcorrectedDGE = calcNormFactors(AWcorrectedDGE)
## Warning in max(abs(logR)): no non-missing arguments to max; returning -Inf

## Warning in max(abs(logR)): no non-missing arguments to max; returning -Inf

## Warning in max(abs(logR)): no non-missing arguments to max; returning -Inf

## Warning in max(abs(logR)): no non-missing arguments to max; returning -Inf

## Warning in max(abs(logR)): no non-missing arguments to max; returning -Inf
AWcorrectedV = cpm(AWcorrectedDGE, log=TRUE, prior.count=3)
aw = arrayWeights(AWcorrectedV, design)
AWcorrectedFit = lmFit(AWcorrectedV, design=design, weights = aw)
AWcorrectedEB = eBayes(AWcorrectedFit, trend=TRUE, robust=TRUE)
decideTests(AWcorrectedEB)
## TestResults matrix
##        (Intercept) condition1
## gene4            1          0
## gene8            1          0
## gene10           1          1
## gene13           1          0
## gene15           1          0
## 5257 more rows ...
AWcorrectedEB.results = topTable(AWcorrectedEB, number = dim(counts)[1])
## Removing intercept from test coefficients
dim(AWcorrectedEB.results)
## [1] 5262    6
head(AWcorrectedEB.results)
##              logFC  AveExpr        t      P.Value    adj.P.Val        B
## gene1726 0.3846869 12.53994 9.520331 1.153496e-19 6.069696e-16 33.84367
## gene479  0.4269378 12.74522 8.465514 3.827358e-16 1.006978e-12 25.86449
## gene489  0.3812028 12.72025 8.024063 9.367140e-15 1.642996e-11 22.72370
## gene1350 0.3212715 12.55499 7.830323 3.666658e-14 4.823489e-11 21.38470
## gene123  0.4079216 12.70924 7.745864 6.597067e-14 6.942753e-11 20.80868
## gene633  0.4810947 12.84121 7.675749 1.070462e-13 9.387953e-11 20.33411
length(geneConditions)
## [1] 5262
sum(geneConditions > 0)
## [1] 732
sum(AWcorrectedEB.results$logFC > 0)
## [1] 2315
sum(AWcorrectedEB.results$adj.P.Val < 0.05)
## [1] 372
sum(geneConditions > 0 & AWcorrectedEB.results$logFC > 0 & AWcorrectedEB.results$adj.P.Val < 0.05)
## [1] 275
sum(geneConditions > 0 & AWcorrectedEB.results$logFC > 0 & AWcorrectedEB.results$adj.P.Val < 0.05)/sum(geneConditions > 0)
## [1] 0.3756831
sum(geneConditions < 0)
## [1] 435
sum(AWcorrectedEB.results$logFC < 0)
## [1] 2947
sum(AWcorrectedEB.results$adj.P.Val < 0.05)
## [1] 372
sum(geneConditions < 0 & AWcorrectedEB.results$logFC < 0 & AWcorrectedEB.results$adj.P.Val < 0.05)
## [1] 0
sum(geneConditions < 0 & AWcorrectedEB.results$logFC < 0 & AWcorrectedEB.results$adj.P.Val < 0.05)/sum(geneConditions > 0)
## [1] 0
library(pander)
x = data.frame(p0.01 = c(sum(geneConditions != 0), 
                         sum(sign(geneConditions) == sign(naiveEB.results$logFC) & naiveEB.results$adj.P.Val < 0.01), 
                         sum(sign(geneConditions) == sign(correctedEB.results$logFC) & correctedEB.results$adj.P.Val < 0.01),  
                         sum(sign(geneConditions) == sign(AWcorrectedEB.results$logFC) & AWcorrectedEB.results$adj.P.Val < 0.01)), 
               p0.05 = c(sum(geneConditions != 0), sum(sign(geneConditions) == sign(naiveEB.results$logFC) & naiveEB.results$adj.P.Val < 0.05),
                         sum(sign(geneConditions) == sign(correctedEB.results$logFC) & correctedEB.results$adj.P.Val < 0.05),  
                         sum(sign(geneConditions) == sign(AWcorrectedEB.results$logFC) & AWcorrectedEB.results$adj.P.Val < 0.05)),
               p0.1 = c(sum(geneConditions != 0), 
                        sum(sign(geneConditions) == sign(naiveEB.results$logFC) & naiveEB.results$adj.P.Val < 0.1), 
                        sum(sign(geneConditions) == sign(correctedEB.results$logFC) & correctedEB.results$adj.P.Val < 0.1),  
                        sum(sign(geneConditions) == sign(AWcorrectedEB.results$logFC) & AWcorrectedEB.results$adj.P.Val < 0.1)))
rownames(x) = c("total", "naive", "preseq corrected", "AW corrected")
pander::pander(x)
  p0.01 p0.05 p0.1
total 1167 1167 1167
naive 402 585 768
preseq corrected 643 771 771
AW corrected 221 275 314
x = data.frame(p0.01 = c(sum(sign(geneConditions) != sign(naiveEB.results$logFC) & naiveEB.results$adj.P.Val < 0.01), 
                         sum(sign(geneConditions) != sign(correctedEB.results$logFC) & correctedEB.results$adj.P.Val < 0.01),  
                         sum(sign(geneConditions) != sign(AWcorrectedEB.results$logFC) & AWcorrectedEB.results$adj.P.Val < 0.01)), 
               p0.05 = c(sum(sign(geneConditions) != sign(naiveEB.results$logFC) & naiveEB.results$adj.P.Val < 0.05),
                         sum(sign(geneConditions) != sign(correctedEB.results$logFC) & correctedEB.results$adj.P.Val < 0.05),  
                         sum(sign(geneConditions) != sign(AWcorrectedEB.results$logFC) & AWcorrectedEB.results$adj.P.Val < 0.05)),
               p0.1 = c(sum(sign(geneConditions) != sign(naiveEB.results$logFC) & naiveEB.results$adj.P.Val < 0.1), 
                        sum(sign(geneConditions) != sign(correctedEB.results$logFC) & correctedEB.results$adj.P.Val < 0.1),  
                        sum(sign(geneConditions) != sign(AWcorrectedEB.results$logFC) & AWcorrectedEB.results$adj.P.Val < 0.1)))
rownames(x) = c("naive", "preseq corrected", "AW corrected")
pander::pander(x)
  p0.01 p0.05 p0.1
naive 264 356 422
preseq corrected 360 628 948
AW corrected 42 97 143
ggplot(data.frame(arrayWeights = aw, preseqWeights = 1 - preseq_estimated_dropout), aes(x = arrayWeights, y = preseqWeights)) + geom_point() + theme_bw() + theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(), axis.line = element_line(colour = "black"))

cor(aw, 1 - preseq_estimated_dropout)
## [1] -0.1920448
# observation weights with edgeR
# from https://github.com/statOmics/zinbwaveZinger/blob/master/timebenchmark/islam/benchmark_islam.Rmd
library(zinbwave)
## Warning: package 'zinbwave' was built under R version 3.5.2
## Loading required package: SummarizedExperiment
## Loading required package: GenomicRanges
## Loading required package: stats4
## Loading required package: BiocGenerics
## Loading required package: parallel
## 
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
##     clusterExport, clusterMap, parApply, parCapply, parLapply,
##     parLapplyLB, parRapply, parSapply, parSapplyLB
## The following object is masked from 'package:limma':
## 
##     plotMA
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, basename, cbind,
##     colMeans, colnames, colSums, dirname, do.call, duplicated,
##     eval, evalq, Filter, Find, get, grep, grepl, intersect,
##     is.unsorted, lapply, lengths, Map, mapply, match, mget, order,
##     paste, pmax, pmax.int, pmin, pmin.int, Position, rank, rbind,
##     Reduce, rowMeans, rownames, rowSums, sapply, setdiff, sort,
##     table, tapply, union, unique, unsplit, which, which.max,
##     which.min
## Loading required package: S4Vectors
## 
## Attaching package: 'S4Vectors'
## The following object is masked from 'package:base':
## 
##     expand.grid
## Loading required package: IRanges
## Loading required package: GenomeInfoDb
## Warning: package 'GenomeInfoDb' was built under R version 3.5.2
## Loading required package: Biobase
## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.
## Loading required package: DelayedArray
## Loading required package: matrixStats
## 
## Attaching package: 'matrixStats'
## The following objects are masked from 'package:Biobase':
## 
##     anyMissing, rowMedians
## Loading required package: BiocParallel
## 
## Attaching package: 'DelayedArray'
## The following objects are masked from 'package:matrixStats':
## 
##     colMaxs, colMins, colRanges, rowMaxs, rowMins, rowRanges
## The following objects are masked from 'package:base':
## 
##     aperm, apply
## Loading required package: SingleCellExperiment
## Warning: package 'SingleCellExperiment' was built under R version 3.5.2
## 
## Attaching package: 'SingleCellExperiment'
## The following object is masked from 'package:edgeR':
## 
##     cpm
library(edgeR)
# compute zinbwave weights
zinb = zinbwave::zinbFit(Y = counts, X = design, epsilon = 1e12)
weights = zinbwave::computeObservationalWeights(zinb, counts)
d = edgeR::DGEList(counts)
d = edgeR::calcNormFactors(d)
## Warning in max(abs(logR)): no non-missing arguments to max; returning -Inf
## Warning in max(abs(logR)): no non-missing arguments to max; returning -Inf

## Warning in max(abs(logR)): no non-missing arguments to max; returning -Inf

## Warning in max(abs(logR)): no non-missing arguments to max; returning -Inf

## Warning in max(abs(logR)): no non-missing arguments to max; returning -Inf
d$weights = weights
d = edgeR::estimateDisp(d, design)
fit = edgeR::glmFit(d,design)
lrt = zinbwave::glmWeightedF(fit,coef=2, independentFiltering = TRUE)
pvals = lrt$table$padjFilter
#pvals = p.adjust(pvals, method = "BH")

sum(geneConditions > 0)
## [1] 732
sum(lrt$table$logFC > 0)
## [1] 1077
sum(pvals < 0.05)
## [1] 827
sum(geneConditions > 0 & lrt$table$logFC > 0 & pvals < 0.05)
## [1] 76
sum(geneConditions > 0 & lrt$table$logFC > 0 & pvals < 0.05)/sum(geneConditions > 0)
## [1] 0.1038251
sum(geneConditions > 0)
## [1] 732
sum(lrt$table$logFC < 0)
## [1] 4185
sum(pvals < 0.05)
## [1] 827
sum(geneConditions > 0 & lrt$table$logFC < 0 & pvals < 0.05)
## [1] 1
sum(geneConditions > 0 & lrt$table$logFC < 0 & pvals < 0.05)/sum(geneConditions > 0)
## [1] 0.00136612
# true positives
x = data.frame(p0.01 = c(sum(sign(geneConditions) == sign(naiveEB.results$logFC) & naiveEB.results$adj.P.Val < 0.01), 
                         sum(sign(geneConditions) == sign(correctedEB.results$logFC) & correctedEB.results$adj.P.Val < 0.01),  
                         sum(sign(geneConditions) == sign(AWcorrectedEB.results$logFC) & AWcorrectedEB.results$adj.P.Val < 0.01),
                         sum(sign(geneConditions) == sign(lrt$table$logFC) & pvals < 0.01)), 
               p0.05 = c(sum(sign(geneConditions) == sign(naiveEB.results$logFC) & naiveEB.results$adj.P.Val < 0.05),
                         sum(sign(geneConditions) == sign(correctedEB.results$logFC) & correctedEB.results$adj.P.Val < 0.05),  
                         sum(sign(geneConditions) == sign(AWcorrectedEB.results$logFC) & AWcorrectedEB.results$adj.P.Val < 0.05),
                         sum(sign(geneConditions) == sign(lrt$table$logFC) & pvals < 0.05)),
               p0.1 = c(sum(sign(geneConditions) == sign(naiveEB.results$logFC) & naiveEB.results$adj.P.Val < 0.1), 
                        sum(sign(geneConditions) == sign(correctedEB.results$logFC) & correctedEB.results$adj.P.Val < 0.1),  
                        sum(sign(geneConditions) == sign(AWcorrectedEB.results$logFC) & AWcorrectedEB.results$adj.P.Val < 0.1),
                        sum(sign(geneConditions) == sign(lrt$table$logFC) & pvals < 0.1)))
rownames(x) = c("naive", "preseq corrected", "AW corrected", "observation weights")
pander::pander(x)
  p0.01 p0.05 p0.1
naive 402 585 768
preseq corrected 643 771 771
AW corrected 221 275 314
observation weights 162 256 345
s = exp(seq(from = -8, to = -1, length = 101))
x = data.frame(algorithm = rep(c("naive", "preseq corrected", "AW corrected","observation weights"), each = length(s)), TruePositives = c(sapply(s, function(p) sum(sign(geneConditions) == sign(naiveEB.results$logFC) & naiveEB.results$adj.P.Val < p)), sapply(s, function(p) sum(sign(geneConditions) == sign(correctedEB.results$logFC) & correctedEB.results$adj.P.Val < p)), sapply(s, function(p) sum(sign(geneConditions) == sign(AWcorrectedEB.results$logFC) & AWcorrectedEB.results$adj.P.Val < p)), sapply(s, function(p) sum(sign(geneConditions) == sign(lrt$table$logFC) & pvals < p)) ), FDRthresh = rep(s, times = 4))
ggplot(x, aes(x = FDRthresh, y = TruePositives, col = algorithm)) + geom_line() + theme_bw() + theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(), axis.line = element_line(colour = "black")) + ggtitle("True positives vs adjust p-value threshold")

x = data.frame(algorithm = rep(c("naive", "preseq corrected", "AW corrected","observation weights"), each = length(s)), FalsePositives = c(sapply(s, function(p) sum(sign(geneConditions) != sign(naiveEB.results$logFC) & naiveEB.results$adj.P.Val < p)), sapply(s, function(p) sum(sign(geneConditions) != sign(correctedEB.results$logFC) & correctedEB.results$adj.P.Val < p)), sapply(s, function(p) sum(sign(geneConditions) != sign(AWcorrectedEB.results$logFC) & AWcorrectedEB.results$adj.P.Val < p)), sapply(s, function(p) sum(sign(geneConditions) != sign(lrt$table$logFC) & pvals < p)) ), FDRthresh = rep(s, times = 4))
ggplot(x, aes(x = FDRthresh, y = FalsePositives, col = algorithm)) + geom_line() + theme_bw() + theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(), axis.line = element_line(colour = "black")) + ggtitle("False positives vs adjust p-value threshold")

#False positives
x = data.frame(p0.01 = c(sum(sign(geneConditions) != sign(naiveEB.results$logFC) & naiveEB.results$adj.P.Val < 0.01), 
                         sum(sign(geneConditions) != sign(correctedEB.results$logFC) & correctedEB.results$adj.P.Val < 0.01),  
                         sum(sign(geneConditions) != sign(AWcorrectedEB.results$logFC) & AWcorrectedEB.results$adj.P.Val < 0.01),
                         sum(sign(geneConditions) != sign(lrt$table$logFC) & pvals < 0.01)), 
               p0.05 = c(sum(sign(geneConditions) != sign(naiveEB.results$logFC) & naiveEB.results$adj.P.Val < 0.05),
                         sum(sign(geneConditions) != sign(correctedEB.results$logFC) & correctedEB.results$adj.P.Val < 0.05),  
                         sum(sign(geneConditions) != sign(AWcorrectedEB.results$logFC) & AWcorrectedEB.results$adj.P.Val < 0.05),
                         sum(sign(geneConditions) != sign(lrt$table$logFC) & pvals < 0.05)),
               p0.1 = c(sum(sign(geneConditions) != sign(naiveEB.results$logFC) & naiveEB.results$adj.P.Val < 0.1), 
                        sum(sign(geneConditions) != sign(correctedEB.results$logFC) & correctedEB.results$adj.P.Val < 0.1),  
                        sum(sign(geneConditions) != sign(AWcorrectedEB.results$logFC) & AWcorrectedEB.results$adj.P.Val < 0.1),
                        sum(sign(geneConditions) == sign(lrt$table$logFC) & pvals < 0.1)))
rownames(x) = c("naive", "preseq corrected", "AW corrected", "observation weights")
pander::pander(x)
  p0.01 p0.05 p0.1
naive 264 356 422
preseq corrected 360 628 948
AW corrected 42 97 143
observation weights 246 571 345